Curious what Congress and Trump are saying about the coronavirus?

This analysis takes a look at the number of tweets, their positive and negative sentiment, and their content, for each party, and how that’s evolving over time since February 1, 2020.

I utilize open data hosted online. In particular, big thanks to Alex Litel who created the Tweets of Congress repo, where I pulled congressional tweets from, and the folks running Trump Twitter Archive, where I pulled Trump’s tweets from.

The repo for this project is: https://github.com/dcosme/congress-tweets-covid19

prep data

load packages

library(tidyverse)
library(jsonlite)
library(tidytext)
library(ggwordcloud)
library(knitr)

define palettes

# Color palettes from the Wes Anderson "Zissou1" scheme.
# palette4: four facet colors, with positions 3 and 4 swapped so parties
# map to the intended hues; palette2: two mid-range sentiment colors.
palette4 <- c(wesanderson::wes_palette("Zissou1", 4, "continuous")[c(1, 2, 4, 3)])
palette5 <- wesanderson::wes_palette("Zissou1", 5, "continuous")
palette2 <- c(palette5[c(2, 4)])

load congress twitter handles

# Congressional Twitter handles from the 116th Congress spreadsheet.
# The four handle-source columns (ODU.WSDL, CSPAN, TweetCongress, Github)
# are reshaped long, cleaned, then spread back to one row per member.
congress_twitter = read.csv("~/Documents/code/US-Congress/116thCongress/116Congress.csv", stringsAsFactors = FALSE) %>%
  rename("name" = Wikipedia..Names) %>%
  # long format: one row per member x handle source
  gather(handle_type, twitter_handle, ODU.WSDL, CSPAN, TweetCongress, Github) %>%
  select(name, handle_type, twitter_handle) %>%
  mutate(twitter_handle = tolower(twitter_handle),
         # empty strings mean "no handle from this source"
         twitter_handle = ifelse(twitter_handle == "", NA, twitter_handle),
         # Repair mis-decoded latin1 bytes (rendered literally as "<e9>"
         # etc.) back to accented letters first...
         name = gsub("<e9>", "é", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<e1>", "á", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<fa>", "ú", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<ed>", "í", `Encoding<-`(name, "latin1"), fixed = TRUE),
         # ...then strip accents entirely so names can join against the
         # other member tables, which use unaccented spellings.
         name = gsub("é", "e", name),
         name = gsub("á", "a", name),
         name = gsub("ú", "u", name),
         name = gsub("í", "i", name),
         name = trimws(name)) %>%
  # join key: first initial + last word of the name
  extract(name, c("first", "last"), "([A-Za-z]{1}).* (.*)", remove = FALSE) %>%
  spread(handle_type, twitter_handle)

# Senate + House member metadata (state, party, demographics, handle).
# Accents are stripped from names to match congress_twitter's join keys.
congress = read.csv("~/Documents/code/us-senate/us-senate/data/us-senate.csv", stringsAsFactors = FALSE) %>%
  bind_rows(read.csv("~/Documents/code/us-house/us-house/data/us-house.csv", stringsAsFactors = FALSE)) %>%
  select(state_name, title, party, name, gender, ethnicity, twitter_handle) %>%
  mutate(twitter_handle = tolower(twitter_handle),
         # Blank handles and two known-bad entries become NA; a single
         # %in% test replaces the original triple-nested ifelse().
         twitter_handle = ifelse(twitter_handle %in% c("", "housedemocrats", "senatorloeffler?lang=en"),
                                 NA, twitter_handle),
         name = gsub("é", "e", name),
         name = gsub("á", "a", name),
         name = gsub("ú", "u", name),
         name = gsub("í", "i", name),
         name = trimws(name)) %>%
  # join key: first initial + last word of the name
  extract(name, c("first", "last"), "([A-Za-z]{1}).* (.*)", remove = FALSE)

# Merge the two member tables on (first initial, last name), fill missing
# attributes within each member from whichever source has them, and keep
# one row per distinct member/handle combination.
congress_info = congress %>%
  full_join(congress_twitter, by = c("first", "last")) %>%
  gather(handle_type, twitter_handle, twitter_handle, ODU.WSDL, CSPAN, TweetCongress, Github) %>%
  select(state_name, title, party, first, last, gender, ethnicity, twitter_handle) %>%
  group_by(first, last) %>%
  fill(state_name, title, party, gender, ethnicity, twitter_handle, .direction = "updown") %>%
  unique() %>%
  filter(!is.na(state_name)) %>%
  ungroup() %>%
  mutate(last = tolower(last))

load congressional tweets

pull to update repo

cd ~/Documents/code/congresstweets
git pull origin master
## From https://github.com/alexlitel/congresstweets
##  * branch            master     -> FETCH_HEAD
## Already up to date.

load the files

# Daily tweet dumps for Feb-Mar 2020 from the congresstweets repo.
file_dir = "~/Documents/code/congresstweets/data"
# Escape the dot before "json" and anchor the extension: the original
# pattern "2020-0[2-3]{1}-.*.json" let "." match any character, and the
# "{1}" quantifier was redundant.
file_pattern = "2020-0[23]-.*\\.json$"
file_list = list.files(file_dir, pattern = file_pattern)

# Read each daily file, skipping (with a message) any that fail to parse.
# Collect results in a list and bind once at the end rather than growing a
# data frame with rbind() inside a loop (quadratic copying). The lambda's
# argument is not named `file`, so base::file() is not shadowed.
tweets_list = lapply(file_list, function(json_file) {
  tryCatch(
    jsonlite::stream_in(file(file.path(file_dir, json_file)), verbose = FALSE),
    error = function(e) {
      message(json_file)
      NULL  # skipped files contribute no rows
    }
  )
})
tweets_temp = bind_rows(tweets_list)

# Keep handle, timestamp, and text; lowercase both text fields so the
# keyword filter and later joins are case-insensitive; retain only tweets
# mentioning the virus.
tweets_all = tweets_temp %>%
  rename("twitter_handle" = screen_name) %>%
  select(twitter_handle, time, text) %>%
  mutate(twitter_handle = tolower(twitter_handle)) %>%
  mutate(text = tolower(text)) %>%
  filter(grepl("corona|virus|covid|flu", text))

find missing congressional twitter handles

# Handles that tweeted but did not match any known member record.
missing = tweets_all %>%
  left_join(congress_info) %>%
  filter(is.na(last)) %>%
  distinct(twitter_handle)

# Unique lowercase last names, used for substring-matching against handles.
last_names = congress_info %>%
  ungroup() %>%
  distinct(last) %>%
  mutate(last = tolower(last))

# For each unmatched handle, collect every congressional last name that
# appears as a substring of the handle. lapply() (not sapply()) guarantees
# a list column even if every handle happens to match exactly one name;
# vapply() makes the inner per-pattern grepl() type-stable.
missing$last <- lapply(missing$twitter_handle, function(handle) {
  last_names$last[vapply(last_names$last, grepl, logical(1), handle)]
})

# Expand multi-match handles to one row per candidate name, add a guessed
# first initial, and write out for hand correction.
missing %>%
  unnest(last, keep_empty = TRUE) %>%
  mutate(first = toupper(substr(twitter_handle, 1, 1)),
         last = ifelse(is.na(last), "", last)) %>%
  write.csv("missing.csv", row.names = FALSE)

# Hand-corrected handle-to-member mapping (missing.csv, edited by hand).
missing_edited = read.csv("missing_edited.csv", stringsAsFactors = FALSE)

# Fold the hand edits into the member table, stack all handle columns into
# one, and collapse the two party columns produced by the join.
congress_full = congress_info %>%
  full_join(missing_edited, by = c("first", "last")) %>%
  gather(var, twitter_handle, contains("twitter_handle")) %>%
  select(-var) %>%
  unique() %>%
  filter(!is.na(twitter_handle)) %>%
  # prefer the hand-edited party (party.y) unless it is blank or NA
  mutate(party = coalesce(na_if(party.y, ""), party.x)) %>%
  select(-c(party.x, party.y)) %>%
  unique() %>%
  filter(!is.na(party))

# Attach member metadata to each tweet and add date/week/month columns.
congress_tweets = tweets_all %>%
  left_join(congress_full) %>%
  mutate(time = lubridate::as_date(time)) %>%
  mutate(week = lubridate::floor_date(time, "week"),
         month = lubridate::floor_date(time, "month"))

# sanity check: handles that never matched a member record
# congress_tweets %>%
#   filter(is.na(party)) %>%
#   distinct(twitter_handle)

load trump tweets

# Download one year of tweets from the Trump Twitter Archive.
#
# Args:
#   year:     calendar year used to build the JSON file name.
#   fromJSON: if TRUE (default), parse the response body into a data frame;
#             if FALSE, return the raw httr response object.
#
# Returns: a data frame of tweets, an empty data frame when the server
#          answers with an HTML page instead of JSON, or (when
#          fromJSON = FALSE) the raw response object.
get_tweets = function(year, fromJSON = TRUE) {
  url <- paste0(
    "http://trumptwitterarchive.com/",
    "data/realdonaldtrump/",
    year,
    ".json"
  )
  resp <- httr::GET(url)
  # warn (don't stop) on non-2xx status so callers can inspect the body
  httr::warn_for_status(resp)
  if (!fromJSON) {
    return(resp)
  }
  body <- httr::content(resp, "text")
  # An HTML error page (starting with "<!DOCTYPE") means no JSON for this
  # year; treat it as "no tweets".
  if (grepl("^\\<\\!DOCTYPE", body)) {
    data.frame()
  } else {
    jsonlite::fromJSON(body)
  }
}

# Trump's 2020 tweets, shaped to the same columns as congress_tweets so the
# two can be row-bound, then filtered to Feb-Mar and virus-related text.
trump_tweets = get_tweets(year = 2020) %>%
  mutate(twitter_handle = "realdonaldtrump",
         # parse Twitter's created_at format, e.g. "Wed Mar 04 12:00:00 +0000 2020"
         time = as.POSIXct(created_at, format = "%a %b %d %H:%M:%S %z %Y"),
         time = lubridate::as_date(time),
         week = lubridate::floor_date(time, "week"),
         month = lubridate::floor_date(time, "month"),
         party = "trump") %>%
  select(twitter_handle, text, time, week, month, party) %>%
  # keep Feb and Mar 2020 only (the Date is coerced to character for grepl)
  filter(grepl("2020-02|2020-03", month)) %>%
  # lowercase BEFORE the keyword filter so matching is case-insensitive
  mutate(text = tolower(text)) %>%
  filter(grepl("corona|virus|covid|flu", text))

merge tweets

# Combine congressional and presidential tweets into one data frame.
tweets = bind_rows(congress_tweets, trump_tweets)

# Root patterns stripped from word counts (URL fragments, media artifacts,
# and the search terms themselves). The dots are escaped: the original
# unescaped ".com" / "t.co" matched any character before "com"/"co", so
# unrelated words such as "income" were silently dropped.
ignore_root_words = "http|\\.com|img|jpg|video|live|index|[0-9]|corona|covid|vid|ncov|aspx|utm|t\\.co|png"
# exact tokens to drop (retweet markers, fragments, contractions)
ignore_words = c("rt", "amp", "qt", "pu", "tag", "i'm", "it's", "i’m", "it’s", "lr", "li", "ag")

define wordcloud function

# Word-cloud of the most frequent words in a set of tweets, faceted by party.
#
# Args:
#   data:       data frame with at least `text`, `party`, `time` columns.
#   party:      optional; keep only rows whose party equals this value.
#   start_date: optional; keep tweets on/after this date (string or Date).
#   duration:   optional; with start_date, keep tweets through
#               start_date + duration days (inclusive at both ends).
#   n_words:    number of top words shown per party.
#   n_colors:   number of color bins (ntile groups) in the cloud.
#   size:       maximum text size passed to scale_size_area().
#
# Returns: list(plot = ggplot object, data = the filtered tweet data).
plot_tweets = function(data, party=NULL, start_date=NULL, duration=NULL, n_words=50, n_colors=6, size=20) {
  
  data = data %>%
      mutate(time = lubridate::as_date(time))
  
  # `!!party` forces the function argument, so it is not captured as the
  # `party` COLUMN inside filter()'s data mask.
  if (!is.null(party)) {
    data = data %>%
      filter(party == !!party)
  }
  
  if (!is.null(start_date)) {
    if (!is.null(duration)) {
      data = data %>%
        filter(time >= start_date & time <= lubridate::date(start_date) + lubridate::days(duration))
    } else {
      data = data %>%
        filter(time >= start_date)
    }
  }
  
  palette = wesanderson::wes_palette("Zissou1", n_colors, "continuous")
  
  # fixed seed so the word-cloud layout is reproducible across renders
  set.seed(42)
  
plot = data %>%
    filter(party %in% c("democrat", "republican", "trump")) %>%
    select(text, party) %>%
    # one row per word, counted within party, minus stop/ignored words
    unnest_tokens(word, text) %>%
    group_by(party) %>%
    count(word, sort = TRUE) %>%
    anti_join(stop_words, by = "word") %>%
    filter(!grepl(ignore_root_words, word)) %>%
    filter(!word %in% ignore_words) %>%
    # top n_words per party (data is still grouped by party here)
    slice(1:n_words) %>%
    mutate(word = gsub("\\.", "", word),
           sum = sum(n),
           size = n / sum,              # within-party relative frequency
           tile = ntile(n, n_colors)) %>%
    ggplot(aes(label = word, size = size, color = as.factor(tile))) +
    geom_text_wordcloud_area(shape = "square") +
      # NOTE(review): power_trans() is not namespaced; presumably
      # scales::power_trans via an attached package — confirm it resolves.
      scale_size_area(max_size = size, trans = power_trans(1/.7)) +
      scale_color_manual(values = palette) +
      facet_wrap(~party) +
      theme_minimal() +
      theme(strip.text.x = element_text(size = 12))
  
  return(list(plot = plot, data = data))
}

number of tweets

How many times have Congress and the President tweeted about COVID-19?

daily

# Daily tweet counts by party, as stacked bars (geom_bar() defaults to
# stat = "count", so the explicit stat is unnecessary).
tweets %>%
  filter(!is.na(party)) %>%
  ggplot(aes(x = time, fill = party)) +
  geom_bar() +
  scale_fill_manual(name = "", values = palette4) +
  labs(x = "", y = "number of tweets\n") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

# The same daily counts as overlaid lines, one per party.
tweets %>%
  filter(!is.na(party)) %>%
  ggplot(aes(x = time, color = party)) +
  geom_line(stat = "count") +
  scale_color_manual(name = "", values = palette4) +
  labs(x = "", y = "number of tweets\n") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

weekly

# Weekly tweet counts by party, as stacked bars (geom_bar() defaults to
# stat = "count").
tweets %>%
  filter(!is.na(party)) %>%
  ggplot(aes(x = week, fill = party)) +
  geom_bar() +
  scale_fill_manual(name = "", values = palette4) +
  labs(x = "", y = "number of tweets\n") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

# The same weekly counts as overlaid lines, one per party.
tweets %>%
  filter(!is.na(party)) %>%
  ggplot(aes(x = week, color = party)) +
  geom_line(stat = "count") +
  scale_color_manual(name = "", values = palette4) +
  labs(x = "", y = "number of tweets\n") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

sentiment of tweets

How positive and negative is the content of the tweets?

Here is a list of the top 20 positive or negative words for each party and the President.

overall

# Top 20 sentiment-bearing words per party over the whole period, scored
# with the Bing lexicon; `order` supports plotting in frequency order.
sentiments = tweets %>%
  unnest_tokens(word, text) %>%
  inner_join(tidytext::get_sentiments("bing"), by = "word") %>%
  anti_join(stop_words, by = "word") %>%
  filter(!grepl(ignore_root_words, word),
         !word %in% ignore_words,
         word != "trump") %>%
  group_by(party) %>%
  count(word, sentiment, sort = TRUE) %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  top_n(20) %>%
  group_by(party) %>%
  arrange(n) %>%
  mutate(order = row_number())

# Horizontal bars of top sentiment words, ordered within each party facet.
sentiments %>%
  ggplot(aes(x = drlib::reorder_within(word, n, party), y = n, fill = sentiment)) +
  geom_col() +
  drlib::scale_x_reordered() +
  coord_flip() +
  facet_wrap(~party, scales = "free") +
  labs(x = NULL, y = "\nnumber of times tweeted") +
  scale_fill_manual(name = "", values = palette2) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

by month

# Top 20 sentiment-bearing words per party and month (Bing lexicon).
sentiments = tweets %>%
  unnest_tokens(word, text) %>%
  inner_join(tidytext::get_sentiments("bing"), by = "word") %>%
  anti_join(stop_words, by = "word") %>%
  filter(!grepl(ignore_root_words, word),
         !word %in% ignore_words,
         word != "trump") %>%
  # readable month labels; the data only covers Feb and Mar 2020
  mutate(month = ifelse(month == "2020-02-01", "february", "march")) %>%
  group_by(party, month) %>%
  count(word, sentiment, sort = TRUE) %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  top_n(20) %>%
  group_by(party, month) %>%
  arrange(n) %>%
  mutate(order = row_number())

sentiments %>%
  # reorder_within() needs the per-row grouping VALUES; the original passed
  # the literal vector c("party", "month"), which recycled the strings
  # "party"/"month" across rows instead of grouping by each party-month
  # cell, so bars were mis-ordered within facets. paste(party, month)
  # supplies the actual combined group label for each row.
  ggplot(aes(drlib::reorder_within(word, n, paste(party, month)), n, fill = sentiment)) +
  geom_col() +
  drlib::scale_x_reordered() +
  facet_wrap(month~party, scales = "free") +
  labs(y = "\nnumber of times tweeted",
       x = NULL) +
  coord_flip() +
  scale_fill_manual(name = "", values = palette2) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

content of tweets

What are Congress and the President saying about COVID-19?

Here are the 100 most frequently used words by each party and the President in February and March.

by month

February

# February word-cloud (top 100 words per party).
p = plot_tweets(tweets, start_date = "2020-02-01", duration = 29, n_words = 100)
p$plot

# Tweet counts per party for this window.
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 2015
republican 1160
trump 28

March

# March word-cloud (top 100 words per party).
p = plot_tweets(tweets, start_date = "2020-03-01", duration = 31, n_words = 100)
p$plot

# Tweet counts per party for this window.
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 7820
republican 6992
trump 83

by week

Here are the 50 most frequently used words by each party and the President for each week in February and March.

Feb 1 - Feb 7

# Week of Feb 1: word-cloud and per-party tweet counts.
p = plot_tweets(tweets, start_date = "2020-02-01", duration = 7)
p$plot

p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 212
republican 140
trump 2

Feb 8 - Feb 14

# Week of Feb 8: word-cloud and per-party tweet counts.
p = plot_tweets(tweets, start_date = "2020-02-08", duration = 7)
p$plot

p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 180
republican 166
trump 1

Feb 15 - Feb 21

# Week of Feb 15: word-cloud and per-party tweet counts.
p = plot_tweets(tweets, start_date = "2020-02-15", duration = 7)
p$plot

p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 110
republican 88
trump 1

Feb 20 - Feb 26

# Week of Feb 20: word-cloud and per-party tweet counts.
p = plot_tweets(tweets, start_date = "2020-02-20", duration = 7)
p$plot

p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 928
republican 482
trump 18

Feb 27 - Mar 4

# Week of Feb 27: word-cloud and per-party tweet counts.
p = plot_tweets(tweets, start_date = "2020-02-27", duration = 7)
p$plot

p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 2840
republican 2966
trump 28

Mar 5 - Mar 11

# Week of Mar 5: word-cloud and per-party tweet counts.
p = plot_tweets(tweets, start_date = "2020-03-05", duration = 7)
p$plot

p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 4983
republican 3754
trump 50

Mar 12 - Mar 18

# Week of Mar 12: word-cloud and per-party tweet counts.
p = plot_tweets(tweets, start_date = "2020-03-12", duration = 7)
p$plot

p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 2935
republican 2520
trump 39